detach("package:dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nycflights13)
library(nycflights13)
dim(flights)
## [1] 336776 19
data(flights)
所生成的数据格式是tibble。他是data.frame的现代版本,具有数据框的大多数属性,但是特别适用于大型数据集的存储和表示。
filter()arrange()select()mutate()group_by() %>% summarise()%>%: 快捷键: ctrl + shift + Mdplyr::filter(flights, month ==1, day == 1)
base R 方法
flights[flights$month == 1 & flights$day == 1, ]
arrange(flights, year, month, day)
arrange(flights, desc(arr_delay))
select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
starts_with() ends_with(), matches(), contains()使用。flights %>% select(contains('dep'))
select(flights, tail_num = tailnum)
rename(flights, tail_num = tailnum)
mutate(flights,
gain = arr_delay - dep_delay,
speed = distance / air_time * 60
)
mutate(flights,
gain = arr_delay - dep_delay,
gain_per_hour = gain / (air_time / 60)
)
transmute(flights,
gain = arr_delay - dep_delay,
gain_per_hour = gain / (air_time / 60)
)
summarise(flights,
delay = mean(dep_delay, na.rm = TRUE)
)
好像很多余?
group_by() 为 summarise插上翅膀。稍后单独展示。
sample_n(flights, 10)
sample_frac(flights, 0.01)
by_tailnum <- group_by(flights, tailnum)
delay <- summarise(by_tailnum,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE))
delay <- dplyr::filter(delay, count > 20, dist < 2000)
使用连贯动作
delay <- flights %>%
group_by(tailnum) %>%
summarise(count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)) %>%
filter(count > 20, dist < 2000)
delay
flights %>%
group_by(year, month, day) %>%
select(arr_delay, dep_delay) %>%
summarise(
arr = mean(arr_delay, na.rm = TRUE),
dep = mean(dep_delay, na.rm = TRUE)
) %>%
filter(arr > 30 | dep > 30)
## Adding missing grouping variables: `year`, `month`, `day`
daily <- group_by(flights, year, month, day)
(per_day <- summarise(daily, flights = n()))
(per_month <- summarise(per_day, flights = sum(flights)))
(per_year <- summarise(per_month, flights = sum(flights)))
flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier)
flights2 %>%
left_join(airlines)
## Joining, by = "carrier"
data("weather")
weather
flights2 %>% left_join(weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
planes
flights2 %>% left_join(planes, by = "tailnum")
airports
flights2 %>% left_join(airports, c("dest" = "faa"))
flights2 %>% left_join(airports, c("origin" = "faa"))
df1 <- tibble(x = c(1, 2), y = 2:1)
df2 <- tibble(x = c(1, 3), a = 10, b = "a")
df1
df2
inner_join(x, y): xy交集df1 %>% inner_join(df2)
## Joining, by = "x"
left_join(x, y), 保留x的所有观测。最为常见df1 %>% left_join(df2)
## Joining, by = "x"
right_join(),保留y中的所有观测,等同于left_join(y, x),但列的排序稍有不同。df1 %>% right_join(df2)
## Joining, by = "x"
df2 %>% left_join(df1)
## Joining, by = "x"
full_join() 保留x, y所有观测,用NA填充。df1 %>% full_join(df2)
## Joining, by = "x"
semi_join(x, y): 保留y中有匹配的x观测值anti_join(x, y):将x中与y有匹配的数据全部丢弃flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
df1 <- tibble(x = c(1, 1, 3, 4), y = 1:4)
df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a"))
df1
df2
df1 %>% nrow()
## [1] 4
df1 %>% inner_join(df2, by = "x")
df1 %>% semi_join(df2, by = "x")
intersect(x, y): xy交集union(x, y): xy并集setdiff(x, y): x中y的补集(df1 <- tibble(x = 1:2, y = c(1L, 1L)))
(df2 <- tibble(x = 1:2, y = 1:2))
intersect(df1, df2)
union(df1, df2)
setdiff(df1, df2)
setdiff(df2, df1)
与mutate和filter结合使用,窗口函数可以大有作为
library(Lahman)
batting <- Lahman::Batting %>%
as_tibble() %>%
select(playerID, yearID, teamID, G, AB:H) %>%
arrange(playerID, yearID, teamID) %>%
semi_join(Lahman::AwardsPlayers, by = "playerID")
players <- batting %>% group_by(playerID)
players
# For each player, find the two years with most hits
filter(players, min_rank(desc(H)) <= 2 & H > 0)
# Within each player, rank each year by the number of games played
mutate(players, G_rank = min_rank(G))
# For each player, find every year that was better than the previous year
filter(players, G > lag(G))
# For each player, compute avg change in games played per year
mutate(players, G_change = (G - lag(G)) / (yearID - lag(yearID)))
# For each player, find all where they played more games than average
filter(players, G > mean(G))
# For each, player compute a z score based on number of games played
mutate(players, G_z = (G - mean(G)) / sd(G))
x <- c(1, 1, 2, 2, 2)
row_number(x)
## [1] 1 2 3 4 5
min_rank(x)
## [1] 1 1 3 3 3
dense_rank(x)
## [1] 1 1 2 2 2
cume_dist(x)
## [1] 0.4 0.4 1.0 1.0 1.0
# select the top 10% of records within each group
filter(players, cume_dist(desc(G)) < 0.1)
by_team_player <- group_by(batting, teamID, playerID)
by_team <- summarise(by_team_player, G = sum(G))
by_team_quartile <- group_by(by_team, quartile = ntile(G, 4))
summarise(by_team_quartile, mean(G))
x <- 1:5
lead(x)
## [1] 2 3 4 5 NA
lag(x)
## [1] NA 1 2 3 4
# Compute the relative change in games played
mutate(players, G_delta = G - lag(G))
# Find when a player changed teams
filter(players, teamID != lag(teamID))